import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lime.lime_tabular import LimeTabularExplainer
from xai.data.reader import read_data
from xai.models import LightGBMModel, LogisticRegressionModel
from xai.validation import HoldOutValidation
features, target = read_data('data/hotel_bookings.csv')
features.head()
validation = HoldOutValidation(test_size=0.1, random_state=42)
(X_train, y_train), (X_test, y_test) = next(validation.split(features, target))
model = LightGBMModel(n_estimators=200,
                      learning_rate=0.007,
                      max_depth=-1,
                      num_leaves=64,
                      n_jobs=4,
                      random_state=42)
model = model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
accuracy = np.mean(y_pred_train == y_train)
print(f'Train accuracy = {accuracy}')
y_pred_test = model.predict(X_test)
accuracy = np.mean(y_pred_test == y_test)
print(f'Test accuracy = {accuracy}')
print(f'Model prediction:\t{model.predict(X_train.iloc[0:1]).values[0]}')
print(f'True value:\t{y_train.iloc[0]}')
# Apply the model's own feature engineering so the explainer sees the same
# representation the model is trained on.
X_train_transform = model._feature_engineering(X_train, train=False)
feature_names = list(X_train_transform.columns)
# Integer positions of the categorical columns (LIME expects column indices).
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_train_transform[feature].dtype.name == 'category']
# For each categorical column, remember the mapping from integer code back to the
# original label (codes are shifted by +1 so missing values, coded as -1, map to 0),
# then replace the column with those integer codes for the explainer.
categorical_names = {}
for feature in categorical_features:
    categorical_map = dict(zip(
        X_train_transform.iloc[:, feature].cat.codes + 1,
        X_train_transform.iloc[:, feature]
    ))
    categorical_names[feature] = categorical_map
    X_train_transform.iloc[:, feature] = X_train_transform.iloc[:, feature].cat.codes + 1
explainer = LimeTabularExplainer(X_train_transform.values,
                                 training_labels=y_train.values,
                                 class_names=['Accepted', 'Canceled'],
                                 mode='classification',
                                 verbose=True,
                                 discretize_continuous=False,
                                 feature_names=feature_names,
                                 categorical_names=categorical_names,
                                 categorical_features=categorical_features)
def predict_fn(data):
    # LIME passes perturbed samples as a plain numpy array; rebuild a DataFrame
    # with the original categorical dtypes before calling the wrapped model.
    data = data.copy()
    data = pd.DataFrame(data, columns=feature_names)
    for feature in categorical_features:
        # Map the (rounded) numeric codes back to the original category labels.
        data.iloc[:, feature] = data.iloc[:, feature].round().astype(int).apply(
            lambda name: categorical_names[feature][name]).astype('category')
    _model = model.model
    return _model.predict_proba(data)
print(f'Model prediction:\t{model.predict(X_train.iloc[0:1]).values[0]}')
print(f'True value:\t{y_train.iloc[0]}')
exp = explainer.explain_instance(X_train_transform.iloc[0],
                                 predict_fn=predict_fn,
                                 num_features=5)
exp.show_in_notebook(show_table=True, show_all=False)
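Besides the notebook widget, the explanation can also be inspected programmatically; a minimal sketch using LIME's as_list and as_pyplot_figure methods (matplotlib is already imported above):
# Sketch: inspect the same explanation without the HTML widget.
# exp.as_list() returns (feature, weight) pairs for the explained class,
# exp.as_pyplot_figure() draws the corresponding bar chart with matplotlib.
for feature, weight in exp.as_list():
    print(f'{feature}: {weight:+.4f}')
exp.as_pyplot_figure()
plt.tight_layout()
plt.show()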
for i in range(4, 10):
    print(f'True value:\t{y_train.iloc[i]}')
    print(f'Model prediction:\t{model.predict(X_train.iloc[i:(i+1)]).values[0]}')
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=5)
    exp.show_in_notebook(show_table=True, show_all=False)
model = LogisticRegressionModel()
model = model.fit(X_train, y_train)
y_pred_train = model.predict(X_train)
accuracy = np.mean(y_pred_train == y_train)
print(f'Train accuracy = {accuracy}')
y_pred_test = model.predict(X_test)
accuracy = np.mean(y_pred_test == y_test)
print(f'Test accuracy = {accuracy}')
# Repeat the preprocessing for the logistic regression model's engineered features.
X_train_transform = model._feature_engineering(X_train, train=False)
feature_names = list(X_train_transform.columns)
categorical_features = [feature_id for feature_id, feature in enumerate(feature_names)
                        if X_train_transform[feature].dtype.name == 'category']
categorical_names = {}
for feature in categorical_features:
    categorical_map = dict(zip(
        X_train_transform.iloc[:, feature].cat.codes + 1,
        X_train_transform.iloc[:, feature]
    ))
    categorical_names[feature] = categorical_map
    X_train_transform.iloc[:, feature] = X_train_transform.iloc[:, feature].cat.codes + 1
explainer = LimeTabularExplainer(X_train_transform.values,
                                 training_labels=y_train.values,
                                 class_names=['Accepted', 'Canceled'],
                                 mode='classification',
                                 verbose=True,
                                 discretize_continuous=False,
                                 feature_names=feature_names,
                                 categorical_names=categorical_names,
                                 categorical_features=categorical_features)
def predict_fn(data):
    # Same wrapper as above, now closing over the logistic regression model.
    data = data.copy()
    data = pd.DataFrame(data, columns=feature_names)
    for feature in categorical_features:
        data.iloc[:, feature] = data.iloc[:, feature].round().astype(int).apply(
            lambda name: categorical_names[feature][name]).astype('category')
    _model = model.model
    return _model.predict_proba(data)
for i in range(10):
    print(f'True value:\t{y_train.iloc[i]}')
    print(f'Model prediction:\t{model.predict(X_train.iloc[i:(i+1)]).values[0]}')
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=5)
    exp.show_in_notebook(show_table=True, show_all=False)
country has a larger weight here than in the previous model, most likely because of the encoding used for this high-cardinality variable.
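One way to sanity-check this observation would be to aggregate the LIME weights assigned to country over a handful of instances; a rough sketch, assuming the engineered column still carries "country" in its name:
# Rough sketch (assumption: the engineered column name contains 'country'):
# average the absolute LIME weight attached to country over a few instances.
country_weights = []
for i in range(10):
    exp = explainer.explain_instance(X_train_transform.iloc[i],
                                     predict_fn=predict_fn,
                                     num_features=len(feature_names))
    country_weights += [abs(w) for name, w in exp.as_list() if 'country' in name]
print(f'Mean |weight| of country: {np.mean(country_weights):.4f}')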